{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "1d1f3076-6165-4fdf-836a-8f6de13cd365",
   "metadata": {},
   "source": [
    "### Feature annotation of inactive genes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6061ab5a-c897-428f-a7b3-b338a7d66570",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "import pysam\n",
    "import numpy as np\n",
    "from pybedtools import BedTool\n",
    "from io import StringIO\n",
    "from functools import reduce\n",
    "import openpyxl\n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "20cb75b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "wkdir=\"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3\"\n",
    "wkdir_path = Path(wkdir)\n",
    "\n",
    "### inputs \n",
    "# inactive genes list \n",
    "inactive_genes_path = wkdir_path.joinpath(\"2_misexp_qc/misexp_gene_cov_corr/gene_id_post_tech_cov_qc_8650.txt\")\n",
    "# TPM expression matrix \n",
    "tpm_mtx_path = wkdir_path.joinpath(\"1_rna_seq_qc/tpm_mtx/tpm_4568samples_59144genes_smpl_qc.csv\")\n",
    "# gencode .gtf\n",
    "gencode_path = wkdir_path.joinpath(\"reference/gencode/gencode.v31.annotation.sorted.gtf.gz\")\n",
    "# gnomad constraint information by transcript \n",
    "gnomad_constraint_path = wkdir_path.joinpath(\"reference/gnomad/gnomad.v2.1.1.lof_metrics.by_transcript.txt\")\n",
    "# pHaplo and pTriplo\n",
    "ptriplo_phaplo_path = wkdir_path.joinpath(\"reference/phaplo_ptriplo/1-s2.0-S0092867422007887-mmc7.xlsx\")\n",
    "# GERP elements\n",
    "gerp_elem_path = wkdir_path.joinpath(\"reference/conservation/gerp/gerpElements_hg38_multiz120Mammals.bed\")\n",
    "# Enhancer domain scores (EDS)\n",
    "eds_path = wkdir_path.joinpath(\"reference/eds/1-s2.0-S0002929720300124-mmc2.xlsx\")\n",
    "# Episcore\n",
    "episcore_path = wkdir_path.joinpath(\"reference/episcore/41467_2018_4552_MOESM5_ESM.xlsx\")\n",
    "# GTEx pass eQTL QC \n",
    "gtex_pass_eqtl_count_path = wkdir_path.joinpath(\"1_rna_seq_qc/gtex_pass_eqtl_qc/gene_pass_eqtl_count.csv\")\n",
    "# median gene TPM per tissue GTEx \n",
    "gtex_median_tpm_path = wkdir_path.joinpath(\"reference/gtex/median_tpm/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.tsv\")\n",
    "# GM12878 A/B compartments \n",
    "gm12878_ab_compartments_path = wkdir_path.joinpath(\"reference/4d_nucleome/gm12878_hi_c/compartments/4DNFILYQ1PAY.bg\")\n",
    "# gnomAD gene sets \n",
    "gnomad_gene_sets_path = wkdir_path.joinpath(\"reference/gnomad/supplement_2020/supplement/supplementary_dataset_13_gene_lists.tsv.gz\")\n",
    "# cosmic genes \n",
    "cosmic_genes_path = wkdir_path.joinpath(\"reference/cosmic_v97/cancer_gene_census.csv\")\n",
    "# OMIM genes \n",
    "omim_genes_path = wkdir_path.joinpath(\"reference/omim/genemap2.txt\")\n",
    "# TAD boundaries (GM12878 shared)\n",
    "tad_boundaries_path = wkdir_path.joinpath(\"reference/4d_nucleome/shared_boundaries/4DNFIVK5JOFU_imr90_huvec_hnek_hmec.bed\")\n",
    "# Open Targets approved \n",
    "approved_drug_targets_path=wkdir_path.joinpath(\"reference/opentargets/targets/open_targets_approved_drugs.txt\")\n",
    "# Decipher genes \n",
    "decipher_genes_path = wkdir_path.joinpath(\"reference/decipher/DDG2P.csv.gz\")\n",
    "# chromHMM PBMCs fraction \n",
    "chromhmm_fraction_path = wkdir_path.joinpath(\"2_misexp_qc/chrom_hmm/gene_chromhmm_state_overlap.csv\")\n",
    "# phyloP gene body \n",
    "phylop_gene_body_path=wkdir_path.joinpath(\"3_misexp_genes/phylop/inactive_gene_phylop_gene_body.tsv\")\n",
    "# output directory\n",
    "out_dir_path = wkdir_path.joinpath(\"3_misexp_genes\")\n",
    "out_dir_path.mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "14c10867",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of inactive genes: 8650\n",
      "INTERVAL, Number of gene IDs with median TPM > 0.5: 17418\n",
      "Overlap between median TPM > 0.5 and inactive gene set: 0\n"
     ]
    }
   ],
   "source": [
    "# load inactive genes \n",
    "inactive_gene_id_df = pd.read_csv(inactive_genes_path, sep=\"\\t\", header=None).rename(columns={0:\"gene_id\"})\n",
    "inactive_gene_id_set = set(inactive_gene_id_df.gene_id.unique())\n",
    "print(f\"Number of inactive genes: {len(inactive_gene_id_set)}\")\n",
    "gene_features_df = inactive_gene_id_df.copy()\n",
    "\n",
    "# get genes with median TPM > 0.5 \n",
    "tpm_mtx_df = pd.read_csv(tpm_mtx_path)\n",
    "tpm_mtx_idx_df = tpm_mtx_df.set_index(\"gene_id\")\n",
    "tpm_median_df = pd.DataFrame(tpm_mtx_idx_df.median(axis=1)).rename(columns={0:\"median_tpm\"}).reset_index()\n",
    "intrvl_median_tpm05_gene_ids = tpm_median_df[tpm_median_df.median_tpm > 0.5].gene_id.tolist()\n",
    "print(f\"INTERVAL, Number of gene IDs with median TPM > 0.5: {len(intrvl_median_tpm05_gene_ids)}\")\n",
    "# check no overlap active and inactive\n",
    "overlap_active_inactive = set(inactive_gene_id_set).intersection(intrvl_median_tpm05_gene_ids)\n",
    "print(f\"Overlap between median TPM > 0.5 and inactive gene set: {len(overlap_active_inactive)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "42cb69df",
   "metadata": {},
   "outputs": [],
   "source": [
    "# generate bed file of inactive gene, active (median TPM > 0.5) and all genes bed file, collect gene names and gene length\n",
    "inactive_gene_info_dict = {}\n",
    "bed_file_dir = out_dir_path.joinpath(\"bed_files\")\n",
    "bed_file_dir.mkdir(parents=True, exist_ok=True)\n",
    "inactive_genes_bed_path = bed_file_dir.joinpath(\"inactive_genes.bed\")\n",
    "active_genes_bed_path = bed_file_dir.joinpath(\"genes_median_tpm_grtr0.5.bed\")\n",
    "all_genes_bed_path = bed_file_dir.joinpath(\"all_genes.bed\")\n",
    "\n",
    "with open(inactive_genes_bed_path, \"w\") as f_inactive, open(active_genes_bed_path, \"w\") as f_active, open(all_genes_bed_path, \"w\") as f_all:\n",
    "    for gtf in pysam.TabixFile(str(gencode_path)).fetch(parser = pysam.asGTF()):\n",
    "        if gtf.feature == \"gene\":\n",
    "            gene_id = gtf.gene_id.split(\".\")[0]\n",
    "            chrom, start, end, strand = gtf.contig, gtf.start, gtf.end, gtf.strand\n",
    "            # write all genes to bed file \n",
    "            f_all.write(f\"{chrom}\\t{start}\\t{end}\\t{gene_id}\\t0\\t{strand}\\n\")\n",
    "            if gene_id in inactive_gene_id_set:\n",
    "                gene_len = end - start\n",
    "                inactive_gene_info_dict[gene_id] = [gtf.gene_name, gene_len]\n",
    "                # write inactive genes to an additional bed file\n",
    "                f_inactive.write(f\"{chrom}\\t{start}\\t{end}\\t{gene_id}\\t0\\t{strand}\\n\")\n",
    "            if gene_id in intrvl_median_tpm05_gene_ids: \n",
    "                # write active genes to an additional bed file\n",
    "                f_active.write(f\"{chrom}\\t{start}\\t{end}\\t{gene_id}\\t0\\t{strand}\\n\")\n",
    "\n",
    "if set(inactive_gene_info_dict.keys()) != inactive_gene_id_set: \n",
    "    raise ValueError(f\"Inactive genes missing from input gencode: {inactive_gene_id_set - set(gene_id_gene_symbol_dict.keys())}\") "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "87a85f07",
   "metadata": {},
   "outputs": [],
   "source": [
    "### genomic features \n",
    "\n",
    "## gene length \n",
    "inactive_gene_info_df = pd.DataFrame.from_dict(inactive_gene_info_dict, orient=\"index\", columns=[\"gene_name\", \"gene_length\"])\n",
    "inactive_gene_info_df = inactive_gene_info_df.reset_index().rename(columns={\"index\":\"gene_id\"})\n",
    "\n",
    "## distance to closest gene \n",
    "# load bed files \n",
    "inactive_genes_bed = BedTool(inactive_genes_bed_path)\n",
    "all_genes_bed = BedTool(all_genes_bed_path)\n",
    "\n",
    "# get closest gene distance, -d reports distance, -N requires different names for query and closest \n",
    "gene_gene_distance_str = StringIO(str(inactive_genes_bed.closest(all_genes_bed, d=True, N=True)))\n",
    "gene_gene_dist_cols = {0:\"chrom_a\", 1:\"start_a\", 2:\"end_a\", 3:\"gene_id_a\", 4:\"score\", 5:\"strand\",\n",
    "                       6:\"chrom_b\", 7:\"start_b\", 8:\"end_b\", 9:\"gene_id_b\", 10:\"score\", 11:\"strand\", \n",
    "                      12:\"gene_distance_min\"\n",
    "                      }\n",
    "gene_gene_distance_df = pd.read_csv(gene_gene_distance_str, sep=\"\\t\", header=None).rename(columns=gene_gene_dist_cols)\n",
    "inactive_gene_distance_df = pd.DataFrame(gene_gene_distance_df.groupby(\"gene_id_a\")[\"gene_distance_min\"].min().reset_index()).rename(columns={\"gene_id_a\": \"gene_id\", \"distance\":f\"min_distance\"})\n",
    "\n",
    "# check no identical gene overlaps\n",
    "identical_gene_overlap = gene_gene_distance_df[gene_gene_distance_df[\"gene_id_a\"] == gene_gene_distance_df[\"gene_id_b\"]].shape[0]\n",
    "if identical_gene_overlap != 0: \n",
    "    raise ValueError(\"Identical genes found in dataframe in closest gene dataframe.\")\n",
    "    \n",
    "## gene density \n",
    "# Count number of genes in +/-1Mb window \n",
    "gene_density_window = 1000000\n",
    "gene_gene_density_str = StringIO(str(inactive_genes_bed.window(all_genes_bed, w=gene_density_window)))\n",
    "gene_gene_density_cols = {0:\"chrom_a\", 1:\"start_a\", 2:\"end_a\", 3:\"gene_id_a\", 4:\"score\", 5:\"strand\",  \n",
    "                       6:\"chrom_b\", 7:\"start_b\", 8:\"end_b\", 9:\"gene_id_b\", 10:\"score\", 11:\"strand\"}\n",
    "gene_gene_density_df = pd.read_csv(gene_gene_density_str, sep=\"\\t\", header=None).rename(columns=gene_gene_density_cols)\n",
    "# remove gene duplicates in window \n",
    "gene_gene_density_df = gene_gene_density_df[~(gene_gene_density_df.gene_id_a == gene_gene_density_df.gene_id_b)]\n",
    "\n",
    "# count unique genes in window (avoid duplicate gene IDs)\n",
    "inactive_gene_density_df = pd.DataFrame(gene_gene_density_df.groupby(\"gene_id_a\")[\"gene_id_b\"].nunique().reset_index()).rename(columns={\"gene_id_a\": \"gene_id\", \"gene_id_b\": \"gene_count_1Mb\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "22f28e8b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of canonical transcripts with gnomAD metrics: 19704\n",
      "Inactive genes with gnomAD metrics: 2934\n",
      "Number of genes with pHaplo and pTriplo data: 18641\n",
      "Inactive genes with pHaplo and pTriplo data: 2850\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/nfs/users/nfs_t/tv5/.conda/envs/tv5_base/lib/python3.7/site-packages/openpyxl/worksheet/_read_only.py:79: UserWarning: Unknown extension is not supported and will be removed\n",
      "  for idx, row in parser.parse():\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of inactive genes with EDS: 3140\n",
      "Number of genes with episcore: 3092\n"
     ]
    }
   ],
   "source": [
    "### conservation and constraint \n",
    "\n",
    "## constraint\n",
    "#gnomAD \n",
    "gnomad_constraint_df = pd.read_csv(gnomad_constraint_path, sep=\"\\t\")\n",
    "# subset to canonical transcript \n",
    "gnomad_constraint_canon_df = gnomad_constraint_df[gnomad_constraint_df.canonical]\n",
    "print(f\"Number of canonical transcripts with gnomAD metrics: {gnomad_constraint_canon_df.shape[0]}\")\n",
    "gnomad_constraint_canon_metrics_df = gnomad_constraint_canon_df[[\"gene_id\", \"pLI\", \"pNull\", \"pRec\", \"oe_lof_upper\", \"oe_mis_upper\"]]\n",
    "# annotate inactive genes \n",
    "inactive_gene_gnomad_df = pd.merge(inactive_gene_info_df[[\"gene_id\"]], gnomad_constraint_canon_metrics_df, on=\"gene_id\", how=\"left\")\n",
    "inactive_gene_with_gnomad_info = inactive_gene_gnomad_df[inactive_gene_gnomad_df.oe_lof_upper.notna()].shape[0]\n",
    "print(f\"Inactive genes with gnomAD metrics: {inactive_gene_with_gnomad_info}\")\n",
    "\n",
    "## pHaplo, pTriplo (Collins et al., 2022)\n",
    "ptriplo_phaplo_full_df = pd.read_excel(ptriplo_phaplo_path).rename(columns={\"Gene\":\"gene_name\"})\n",
    "print(f\"Number of genes with pHaplo and pTriplo data: {ptriplo_phaplo_full_df.shape[0]}\")\n",
    "# subset to pHaplo and pTriplo\n",
    "ptriplo_phaplo_df = ptriplo_phaplo_full_df[[\"gene_name\", \"pHaplo\", \"pTriplo\"]]\n",
    "ptriplo_phaplo_gene_id_df = pd.merge(inactive_gene_info_df[[\"gene_id\", \"gene_name\"]], ptriplo_phaplo_df, on=\"gene_name\", how=\"left\")\n",
    "inactive_gene_with_phalpo_ptriplo_info = ptriplo_phaplo_gene_id_df[ptriplo_phaplo_gene_id_df.pHaplo.notna()].shape[0]\n",
    "print(f\"Inactive genes with pHaplo and pTriplo data: {inactive_gene_with_phalpo_ptriplo_info}\")\n",
    "\n",
    "## EDS (enhancer domain score)\n",
    "# load EDS scores \n",
    "eds_df = pd.read_excel(eds_path).rename(columns={\"GeneSymbol\":\"gene_id\"})\n",
    "# EDS score and information on number of enhancers - proximity and activity linking \n",
    "eds_columns_to_keep = ['gene_id', 'EDS', 'ActivityLinking_Conserved_nt_count',\n",
    "                       'ActivityLinking_nt_count', 'ProximityLinking_Conserved_nt_count',\n",
    "                       'ProximityLinking_nt_count', 'ActivityLinking_EnhancerNumber',\n",
    "                       'ActivityLinking_NumberConservedElements',\n",
    "                       'ProximityLinking_EnhancerNumber','ProximityLinking_NumberConservedElements']\n",
    "gene_features_eds_df = pd.merge(gene_features_df, eds_df, \n",
    "                                on=\"gene_id\", \n",
    "                                how=\"left\")[eds_columns_to_keep].drop_duplicates()\n",
    "inactive_genes_with_eds = gene_features_eds_df[gene_features_eds_df.EDS.notna()].shape[0]\n",
    "print(f\"Number of inactive genes with EDS: {inactive_genes_with_eds}\")\n",
    "\n",
    "## Episcore \n",
    "# load episcore\n",
    "episcore_df = pd.read_excel(episcore_path)\n",
    "# replace header with first row \n",
    "header = episcore_df.iloc[0]\n",
    "episcore_df = episcore_df[1:] \n",
    "episcore_df.columns = header\n",
    "episcore_clean_df = episcore_df[[\"EnsembleID\",\"Episcore\"]].rename(columns={\"EnsembleID\":\"gene_id\"})\n",
    "# annotated inactive gene set \n",
    "gene_features_episcore_df = pd.merge(inactive_gene_info_df[[\"gene_id\"]], episcore_clean_df, \n",
    "                                     on=\"gene_id\", \n",
    "                                     how=\"left\").drop_duplicates()\n",
    "gene_features_episcore_df = gene_features_episcore_df.astype({\"Episcore\": float})\n",
    "inactive_gene_with_episcore = gene_features_episcore_df[gene_features_episcore_df.Episcore.notna()].shape[0]\n",
    "print(f\"Number of genes with episcore: {inactive_gene_with_episcore}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "28abd0b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "## conservation \n",
    "# Number of conserved GERP elements per base pair within +/-10kb from gene \n",
    "gerp_elem_window = 10000\n",
    "gerp_elem_bed = BedTool(gerp_elem_path)\n",
    "\n",
    "gerp_elem_intersect_str = StringIO(str(inactive_genes_bed.window(gerp_elem_bed, w=gerp_elem_window)))\n",
    "gerp_elem_intersect_cols = {0:\"chrom_gene\", 1:\"start_gene\", 2:\"end_gene\", 3:\"gene_id\", 4:\"score\", 5:\"strand\",  \n",
    "                            6:\"chrom_elem\", 7:\"start_elem\", 8:\"end_elem\", 9:\"elem_id\"}\n",
    "gerp_elem_intersect_df = pd.read_csv(gerp_elem_intersect_str, sep=\"\\t\", header=None).rename(columns=gerp_elem_intersect_cols)\n",
    "# count number of intersecting elements \n",
    "gerp_elem_count_df = pd.DataFrame(gerp_elem_intersect_df.groupby(\"gene_id\", as_index=False).elem_id.count())\n",
    "# count missing elements\n",
    "gerp_elem_count_df = pd.merge(inactive_gene_info_df[[\"gene_id\", \"gene_length\"]], gerp_elem_count_df, \n",
    "                              on=\"gene_id\", \n",
    "                              how=\"left\").fillna(0)\n",
    "gerp_elem_count_df = gerp_elem_count_df.rename(columns={\"elem_id\":\"gerp_element_count\"})\n",
    "gerp_elem_count_df[\"window_length\"] = gerp_elem_count_df.gene_length + 2 * gerp_elem_window\n",
    "gerp_elem_count_df[\"gerp_element_per_bp\"] = gerp_elem_count_df.gerp_element_count/gerp_elem_count_df.window_length\n",
    "gerp_elem_count_per_bp_df = gerp_elem_count_df[[\"gene_id\", \"gerp_element_per_bp\"]]\n",
    "\n",
    "## mean 100-way phyloP score gene body\n",
    "phylop_gene_body_df = pd.read_csv(phylop_gene_body_path, sep=\"\\t\")\n",
    "gene_mean_phylop_df = phylop_gene_body_df[[\"gene_id\", \"phylop_mean\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "cf786de3",
   "metadata": {},
   "outputs": [],
   "source": [
    "### Gene expression features \n",
    "gene_features_expression_df = inactive_gene_info_df[[\"gene_id\"]].copy()\n",
    "\n",
    "# load median TPM per tissue \n",
    "gtex_median_tpm_tissue_trunc_df = pd.read_csv(gtex_median_tpm_path, sep=\"\\t\").drop(columns=[\"Description\"]).rename(columns={\"Name\":\"gene_id\"})\n",
    "gtex_median_tpm_tissue_trunc_df[\"gene_id\"] = gtex_median_tpm_tissue_trunc_df.gene_id.str.split(\".\").str[0]\n",
    "gtex_median_tpm_tissue_trunc_df = gtex_median_tpm_tissue_trunc_df.set_index(\"gene_id\")\n",
    "\n",
    "## number of tissues gene is expressed \n",
    "tissues_to_drop = ['Cells - Cultured fibroblasts', 'Cells - EBV-transformed lymphocytes']\n",
    "gtex_median_tpm_tissue_trunc_df = gtex_median_tpm_tissue_trunc_df.drop(columns=tissues_to_drop)\n",
    "tissue_expression_count_df = pd.DataFrame((gtex_median_tpm_tissue_trunc_df > 0.5).sum(axis=1))\n",
    "tissue_expression_count_df = tissue_expression_count_df.rename(columns={0:\"tissue_expression\"}).reset_index()\n",
    "inactive_tissue_expression_df = pd.merge(inactive_gene_info_df[[\"gene_id\"]], \n",
    "                                               tissue_expression_count_df, on=\"gene_id\", how=\"left\")\n",
    "\n",
    "## expression by tissue\n",
    "# collect tissues and associated sub-tissues \n",
    "gtex_organs_to_tissue_dict = {}\n",
    "for column in gtex_median_tpm_tissue_trunc_df.columns: \n",
    "    organ_name = column.split(\" - \")[0]\n",
    "    gtex_organs_to_tissue_dict.setdefault(organ_name,[]).append(column)\n",
    "# remove blood and cells \n",
    "for organ in [\"Cells\", \"Whole Blood\"]:\n",
    "    gtex_organs_to_tissue_dict.pop(organ, None)\n",
    "# if median TPM > 0.5 in any sub-tissue then gene expressed in tissue\n",
    "for organ in gtex_organs_to_tissue_dict.keys():\n",
    "    tissues = gtex_organs_to_tissue_dict[organ]\n",
    "    gtex_tpm_organ_df = gtex_median_tpm_tissue_trunc_df[tissues].copy()\n",
    "    #categorical variable for gene expression, TPM > 0.5\n",
    "    gene_tpm_binary_df = pd.DataFrame((gtex_tpm_organ_df > 0.5).any(axis=1).astype(int)).rename(columns={0:f\"{organ.lower()}_expression\"})\n",
    "    gene_features_expression_df = pd.merge(gene_features_expression_df, gene_tpm_binary_df, on=\"gene_id\", how=\"left\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "ffc44914",
   "metadata": {},
   "outputs": [],
   "source": [
    "## distance to nearest active gene \n",
    "active_genes_bed = BedTool(active_genes_bed_path)\n",
    "active_gene_distance_str = StringIO(str(inactive_genes_bed.closest(active_genes_bed, d=True, N=True)))\n",
    "active_gene_distance_df = pd.read_csv(active_gene_distance_str, sep=\"\\t\", header=None).rename(columns=gene_gene_dist_cols)\n",
    "active_gene_to_gene_distance_df = pd.DataFrame(active_gene_distance_df.groupby(\"gene_id_a\")[\"gene_distance_min\"].min().reset_index()).rename(columns={\"gene_id_a\": \"gene_id\", \"gene_distance_min\":f\"active_gene_distance\"})\n",
    "\n",
    "# check no identical gene overlaps\n",
    "identical_gene_overlap = active_gene_distance_df[active_gene_distance_df[\"gene_id_a\"] == active_gene_distance_df[\"gene_id_b\"]].shape[0]\n",
    "if identical_gene_overlap != 0: \n",
    "    raise ValueError(\"Identical genes found in dataframe in closest gene dataframe.\")\n",
    "# subset to inactive genes \n",
    "inactive_gene_active_distance_df = pd.merge(inactive_gene_info_df[[\"gene_id\"]], active_gene_to_gene_distance_df, on=\"gene_id\", how=\"left\")\n",
    "\n",
    "## active gene density \n",
    "# Count number of genes in +/-1Mb window \n",
    "active_gene_gene_density_str = StringIO(str(inactive_genes_bed.window(active_genes_bed, w=gene_density_window)))\n",
    "active_gene_gene_density_df = pd.read_csv(active_gene_gene_density_str, sep=\"\\t\", header=None).rename(columns=gene_gene_density_cols)\n",
    "# remove gene duplicates in window \n",
    "active_gene_gene_density_df = active_gene_gene_density_df[~(active_gene_gene_density_df.gene_id_a == active_gene_gene_density_df.gene_id_b)]\n",
    "# count unique genes in window (avoid duplicate gene IDs)\n",
    "active_gene_density_df = pd.DataFrame(active_gene_gene_density_df.groupby(\"gene_id_a\")[\"gene_id_b\"].nunique().reset_index()).rename(columns={\"gene_id_a\": \"gene_id\", \"gene_id_b\": \"active_gene_count_1Mb\"})\n",
    "# fill in missing genes\n",
    "active_gene_density_all_genes_df = pd.merge(inactive_gene_info_df[[\"gene_id\"]], active_gene_density_df, on=\"gene_id\", how=\"left\").fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "fee0ed0e",
   "metadata": {},
   "outputs": [],
   "source": [
    "### regulatory features \n",
    "\n",
    "## fraction of gene body overlapping chromatin state in PBMCs\n",
    "chromhmm_fraction_df = pd.read_csv(chromhmm_fraction_path)\n",
    "chromhmm_fraction_df[\"gene_id\"] = chromhmm_fraction_df.gene_id.str.split(\".\").str[0]\n",
    "chromhmm_fraction_inactive_df = pd.merge(inactive_gene_info_df[[\"gene_id\"]], chromhmm_fraction_df, on=\"gene_id\", how=\"left\")\n",
    "\n",
    "chromhmm_15states = ['1_TssA', '2_TssAFlnk', '3_TxFlnk', '4_Tx', '5_TxWk', '6_EnhG', '7_Enh', '8_ZNF/Rpts', '9_Het', \n",
    "                     '10_TssBiv', '11_BivFlnk', '12_EnhBiv', '13_ReprPC', '14_ReprPCWk', '15_Quies']\n",
    "chrom_col_replace = {col:f\"gene_fraction_{col.split('_')[1].replace('/', '')}\" for col in chromhmm_15states} \n",
    "chromhmm_fraction_inactive_df = chromhmm_fraction_inactive_df.rename(columns=chrom_col_replace)\n",
    "\n",
    "# fraction of gene body overlap A/B compartment in GM12878\n",
    "inactive_gene_bed = BedTool(inactive_genes_bed_path)\n",
    "gm12878_ab_bed = BedTool(gm12878_ab_compartments_path)\n",
    "inactive_gene_intersect_ab_str = StringIO(str(inactive_gene_bed.intersect(gm12878_ab_bed, wo=True)))\n",
    "gene_ab_cols = {0:\"chrom_gene\", 1:\"start_gene\", 2:\"end_gene\", 3:\"gene_id\", 4:\"score\", 5:\"strand\", \n",
    "                6: 'compartment_chrom', 7: 'compartment_start', 8: 'compartment_end',\n",
    "                9: 'compartment_score', 10: 'overlap'}\n",
    "inactive_gene_intersect_ab_df = pd.read_csv(inactive_gene_intersect_ab_str, sep=\"\\t\", header=None).rename(columns=gene_ab_cols)\n",
    "# label compartment types \n",
    "# A-compartment: > 0 and B-compartment < 0\n",
    "conditions = [(inactive_gene_intersect_ab_df.compartment_score >= 0) & (~inactive_gene_intersect_ab_df.compartment_score.isnull()), \n",
    "              (inactive_gene_intersect_ab_df.compartment_score < 0) & (~inactive_gene_intersect_ab_df.compartment_score.isnull()), \n",
    "              (inactive_gene_intersect_ab_df.compartment_score.isnull())]\n",
    "\n",
    "values = [\"A\", \"B\", \"Unassigned\"]\n",
    "inactive_gene_intersect_ab_df[\"compartment_type\"] = np.select(conditions, values)\n",
    "\n",
    "gene_compartment_overlap = {}\n",
    "for i, gene_id in enumerate(inactive_gene_intersect_ab_df.gene_id.unique()):\n",
    "    gene_compartment_overlap[i] = [gene_id]\n",
    "    for compartment in [\"A\", \"B\", \"Unassigned\"]:\n",
    "        total_overlap = inactive_gene_intersect_ab_df[(inactive_gene_intersect_ab_df.gene_id == gene_id) & \n",
    "                                                      (inactive_gene_intersect_ab_df.compartment_type == compartment)\n",
    "                                                     ].overlap.sum()\n",
    "        gene_compartment_overlap[i].append(total_overlap)\n",
    "gene_compartment_overlap_df = pd.DataFrame.from_dict(gene_compartment_overlap, \n",
    "                                                     orient=\"index\", \n",
    "                                                     columns=[\"gene_id\", \"overlap_A\", \"overlap_B\", \"overlap_unassigned\"])\n",
    "gene_compartment_overlap_df = pd.merge(inactive_gene_info_df[[\"gene_id\", \"gene_length\"]], \n",
    "                                       gene_compartment_overlap_df, \n",
    "                                       on=\"gene_id\", \n",
    "                                       how=\"inner\"\n",
    "                                      )\n",
    "gene_compartment_overlap_df[\"fraction_A\"] = gene_compartment_overlap_df.overlap_A / gene_compartment_overlap_df.gene_length\n",
    "gene_compartment_overlap_df[\"fraction_B\"] = gene_compartment_overlap_df.overlap_B / gene_compartment_overlap_df.gene_length\n",
    "gene_compartment_overlap_df[\"fraction_unassigned\"] = gene_compartment_overlap_df.overlap_unassigned / gene_compartment_overlap_df.gene_length\n",
    "gene_compartment_overlap_df = gene_compartment_overlap_df[[\"gene_id\", \"fraction_A\", \"fraction_B\", \"fraction_unassigned\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "26e9b728",
   "metadata": {},
   "outputs": [],
   "source": [
    "## TAD boundary distance \n",
    "tad_boundaries_bed = BedTool(tad_boundaries_path)\n",
    "inactive_gene_tad_distance_str = StringIO(str(inactive_gene_bed.closest(tad_boundaries_bed, d=True)))\n",
    "gene_tad_distance_cols = {0:\"chrom\", 1:\"start\", 2:\"end\", 3:\"gene_id\", 4:\"score\", 5:\"strand\",\n",
    "                          6:\"chrom_tad\", 7:\"start_tad\", 8:\"end_tad\", 9:\"tad_strength\", 10:\"tad_score\", \n",
    "                          11:\"tad_distance\"\n",
    "                         }\n",
    "inactive_gene_tad_distance_df = pd.read_csv(inactive_gene_tad_distance_str, sep=\"\\t\", header=None).rename(columns=gene_tad_distance_cols)\n",
    "inactive_gene_tad_distance_min_df = inactive_gene_tad_distance_df.groupby(\"gene_id\", as_index=False).tad_distance.min()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "109d6b84",
   "metadata": {},
   "source": [
    "### Gene Sets "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "7bbda56a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# olfactory, autosomal dominant, haploinsufficient and autosomal recessive genes from gnomAD\n",
    "gnomad_gene_set_df = pd.read_csv(gnomad_gene_sets_path, sep=\"\\t\")\n",
    "inactive_gene_gnomad_sets_df = inactive_gene_info_df[[\"gene_id\", \"gene_name\"]].copy()\n",
    "for gnomad_gene_set in gnomad_gene_set_df.gene_list.unique():\n",
    "    gene_set = gnomad_gene_set_df[gnomad_gene_set_df.gene_list == gnomad_gene_set].gene.unique()\n",
    "    inactive_gene_gnomad_sets_df[f\"gnomad_{gnomad_gene_set.lower().replace(' ', '_')}\"] = np.where(inactive_gene_info_df.gene_name.isin(gene_set), 1, 0)\n",
    "inactive_gene_gnomad_sets_df = inactive_gene_gnomad_sets_df.drop(columns=[\"gene_name\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "f69e00ef",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of oncogenes in list: 245\n"
     ]
    }
   ],
   "source": [
    "# oncogenes, cosmic v97, downloaded Cancer Gene Census \n",
    "cosmic_cancer_gene_df = pd.read_csv(cosmic_genes_path)\n",
    "# select Tier 1, dominant oncogenes\n",
    "oncogenes_df = cosmic_cancer_gene_df[(cosmic_cancer_gene_df[\"Tier\"] == 1) &\n",
    "                                     (cosmic_cancer_gene_df[\"Molecular Genetics\"] == \"Dom\") & \n",
    "                                     (cosmic_cancer_gene_df[\"Role in Cancer\"].str.contains(\"oncogene\"))\n",
    "                                     ]\n",
    "# get ENSG IDs\n",
    "oncogene_gene_id_list = []\n",
    "for synonyms in oncogenes_df.Synonyms.tolist(): \n",
    "    if \"ENSG\" in synonyms: \n",
    "        for synonym in synonyms.split(\",\"):\n",
    "            if synonym.startswith(\"ENSG\"): \n",
    "                oncogene_gene_id_list.append(synonym.split(\".\")[0])\n",
    "    else: \n",
    "        oncogene_gene_id_list.append(np.nan)\n",
    "print(f\"Number of oncogenes in list: {len(oncogene_gene_id_list)}\")\n",
    "oncogene_info_df = inactive_gene_info_df[[\"gene_id\"]].copy()\n",
    "oncogene_info_df[\"oncogene\"] = np.where(oncogene_info_df.gene_id.isin(oncogene_gene_id_list), 1, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "af222164",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Decipher genes - curated list of genes associated with developmental disorders \n",
    "# https://www.deciphergenomics.org/ddd/ddgenes \n",
    "decipher_genes_df = pd.read_csv(decipher_genes_path)\n",
    "# limit to strong, definitive and moderate \n",
    "decipher_genes_conf_df = decipher_genes_df[decipher_genes_df[\"confidence category\"].isin([\"strong\", \"definitive\", \"moderate\"])]\n",
    "decipher_genes = decipher_genes_conf_df[\"gene symbol\"].unique()\n",
    "decipher_genes_df = inactive_gene_info_df[[\"gene_name\", \"gene_id\"]].copy()\n",
    "decipher_genes_df[\"decipher_gene\"] = np.where(decipher_genes_df.gene_name.isin(decipher_genes), 1, 0)\n",
    "decipher_genes_df = decipher_genes_df.drop(columns=[\"gene_name\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "94020201",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of approved drug targets: 929\n",
      "Number of approved targets in inactive gene set: 213\n"
     ]
    }
   ],
   "source": [
    "# OpenTargets approved drugs \n",
    "approved_targets = pd.read_csv(approved_drug_targets_path, header=None)[0].tolist()\n",
    "print(f\"Number of approved drug targets: {len(approved_targets)}\")\n",
    "inactive_approved_targets_df = inactive_gene_info_df[[\"gene_id\"]].copy()\n",
    "inactive_approved_targets_df[\"approved_target\"] = np.where(inactive_approved_targets_df.gene_id.isin(approved_targets), 1, 0)\n",
    "inactive_gene_approved_target = inactive_approved_targets_df[inactive_approved_targets_df.approved_target == 1].shape[0]\n",
    "print(f\"Number of approved targets in inactive gene set: {inactive_gene_approved_target}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "d4df10de",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of OMIM genes: 38458\n"
     ]
    }
   ],
   "source": [
    "# OMIM genes \n",
    "omim_genes_list = []\n",
    "with open(omim_genes_path, 'r') as f_in: \n",
    "    for line in f_in: \n",
    "        if line.startswith(\"#\"):\n",
    "            continue \n",
    "        else: \n",
    "            genes = line.split(\"\\t\")[6]\n",
    "            for gene in genes.split(\", \"): \n",
    "                omim_genes_list.append(gene)\n",
    "omim_genes_set = set(omim_genes_list)\n",
    "print(f\"Number of OMIM genes: {len(omim_genes_set)}\")\n",
    "inactive_omim_df = inactive_gene_info_df[[\"gene_id\", \"gene_name\"]].copy()\n",
    "inactive_omim_df[\"omim_gene\"] = np.where(inactive_omim_df.gene_name.isin(omim_genes_set), 1, 0)\n",
    "inactive_omim_df = inactive_omim_df.drop(columns=[\"gene_name\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "f00bab17",
   "metadata": {},
   "outputs": [],
   "source": [
    "# protein-coding and lncRNA annotation\n",
    "# collect all gene types from Gencode \n",
    "gene_types, count = {}, 0\n",
    "for gtf in pysam.TabixFile(str(gencode_path)).fetch(parser = pysam.asGTF()):\n",
    "    if gtf.feature == \"gene\": \n",
    "        gene_id = gtf.gene_id.split(\".\")[0]\n",
    "        gene_type = gtf.gene_type \n",
    "        gene_types[count] = [gene_id, gene_type]\n",
    "        count += 1\n",
    "gene_types_df = pd.DataFrame.from_dict(gene_types, orient=\"index\", columns=[\"gene_id\", \"gene_type\"])\n",
    "gene_types_df[\"protein_coding\"] = np.where(gene_types_df.gene_type == \"protein_coding\", 1, 0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "541ee168",
   "metadata": {},
   "source": [
    "### Merge features "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "a057be43",
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_df_list = [inactive_gene_info_df[[\"gene_id\", \"gene_length\"]],\n",
    "                    inactive_gene_density_df, \n",
    "                    inactive_gene_distance_df, \n",
    "                    inactive_tissue_expression_df, \n",
    "                    gene_features_expression_df,\n",
    "                    inactive_gene_active_distance_df, \n",
    "                    active_gene_density_all_genes_df, \n",
    "                    chromhmm_fraction_inactive_df,\n",
    "                    gene_compartment_overlap_df, \n",
    "                    inactive_gene_gnomad_sets_df, \n",
    "                    oncogene_info_df,\n",
    "                    gene_features_eds_df, \n",
    "                    gene_features_episcore_df,\n",
    "                    ptriplo_phaplo_gene_id_df[[\"gene_id\", \"pHaplo\", \"pTriplo\"]],\n",
    "                    inactive_gene_gnomad_df, \n",
    "                    gerp_elem_count_per_bp_df, \n",
    "                    inactive_gene_tad_distance_min_df, \n",
    "                    gene_mean_phylop_df, \n",
    "                    inactive_approved_targets_df, \n",
    "                    decipher_genes_df, \n",
    "                    inactive_omim_df, \n",
    "                    gene_types_df\n",
    "                   ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "1a3d6e92",
   "metadata": {},
   "outputs": [],
   "source": [
    "# merge features \n",
    "gene_features_merged_df = reduce(lambda  left,right: pd.merge(left,right, on=\"gene_id\",\n",
    "                                                              how='inner'), feature_df_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "29707dae-350b-4631-85b1-b8cdc396a47b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# rename features for statsmodels \n",
    "adj_colnames = {col:col.replace(\" \", \"_\").replace(\"\\n\",\"\").replace(\"-\",\"\").replace(\".\",\"\") for col in gene_features_merged_df.columns}\n",
    "gene_features_merged_adj_colnames_df = gene_features_merged_df.rename(columns=adj_colnames)\n",
    "gene_features_merged_adj_colnames_df.to_csv(out_dir_path.joinpath(f\"inactive_gene_features_{len(inactive_gene_id_set)}.csv\"), index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
